full_data <- readRDS('data/full_data.rds')
daily_full <- readRDS('data/daily_full.rds')

Explore some GAM models

Five minute H2S

h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.172e+00  3.213e-03  364.72   <2e-16 ***
## year2023       -3.263e-01  2.800e-03 -116.54   <2e-16 ***
## wd_secQ2       -2.560e-01  3.751e-03  -68.23   <2e-16 ***
## wd_secQ3       -2.898e-01  3.892e-03  -74.46   <2e-16 ***
## wd_secQ4       -2.095e-01  3.492e-03  -59.99   <2e-16 ***
## ws             -5.424e-02  4.176e-04 -129.91   <2e-16 ***
## I(1/MinDist^2)  2.114e+05  2.329e+03   90.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.996      8 2496  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0946   Deviance explained = 9.47%
## GCV = 0.92965  Scale est. = 0.92963   n = 772251
plot(h2s_model_a)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4627048 247.2    8310081  443.9   7027379  375.4
## Vcells 104359554 796.3  271985254 2075.1 226394449 1727.3
h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
                   s(mon_utm_x, mon_utm_y, bs='tp', k = 8), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) + 
##     s(mon_utm_x, mon_utm_y, bs = "tp", k = 8)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.013e+00  3.170e-03  319.43   <2e-16 ***
## wd_secQ2       -1.953e-01  3.695e-03  -52.86   <2e-16 ***
## wd_secQ3       -1.816e-01  3.905e-03  -46.51   <2e-16 ***
## wd_secQ4       -1.131e-01  3.486e-03  -32.43   <2e-16 ***
## ws             -6.919e-02  4.146e-04 -166.88   <2e-16 ***
## I(1/MinDist^2)  2.983e+05  2.856e+03  104.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.999      8 1710  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 6.999      7 6572  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.131   Deviance explained = 13.1%
## GCV = 0.89282  Scale est. = 0.8928    n = 772251
plot(h2s_model_b)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4640722 247.9    8310081  443.9   7027379  375.4
## Vcells 114876122 876.5  326462304 2490.8 325456152 2483.1
# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws + 
                     I(1/MinDist^2) + Converted_Angle + 
                     s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
                     as.factor(weekday), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Converted_Angle + s(mon_utm_x, mon_utm_y, 
##     bs = "tp", k = 10) + as.factor(weekday)
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.494e+00  2.321e-02  107.466  < 2e-16 ***
## year2023             -3.331e-01  2.866e-03 -116.237  < 2e-16 ***
## wd_secQ2             -1.863e-01  3.645e-03  -51.123  < 2e-16 ***
## wd_secQ3             -1.914e-01  3.848e-03  -49.753  < 2e-16 ***
## wd_secQ4             -1.096e-01  3.443e-03  -31.842  < 2e-16 ***
## ws                   -6.123e-02  4.080e-04 -150.068  < 2e-16 ***
## I(1/MinDist^2)       -3.291e-05  3.304e-07  -99.583  < 2e-16 ***
## Converted_Angle      -6.125e-03  1.104e-04  -55.496  < 2e-16 ***
## as.factor(weekday).L  9.861e-02  2.802e-03   35.190  < 2e-16 ***
## as.factor(weekday).Q -1.699e-01  2.810e-03  -60.461  < 2e-16 ***
## as.factor(weekday).C -3.825e-03  2.805e-03   -1.364    0.173    
## as.factor(weekday)^4 -2.049e-02  2.815e-03   -7.277 3.41e-13 ***
## as.factor(weekday)^5 -5.801e-02  2.809e-03  -20.654  < 2e-16 ***
## as.factor(weekday)^6 -2.508e-02  2.821e-03   -8.889  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.998      8 2704  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.999      9 6486  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 30/31
## R-sq.(adj) =  0.155   Deviance explained = 15.6%
## GCV = 0.86717  Scale est. = 0.86714   n = 772251
plot(h2s_model_c)

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4642589 248.0    8310081  443.9   7027379  375.4
## Vcells 126899759 968.2  391834764 2989.5 379580506 2896.0
# Include converted angle and weekday
h2s_model_d <- gam(H2S ~ 
                     s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                     wd_sec + ws + downwind_ref + downwind_wrp +
                     I(1/MinDist^2) + dist_wrp + capacity +
                     Converted_Angle + 
                     s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
                     monthly_oil_1km + monthly_gas_1km + active_1km
                     , 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     dist_wrp + capacity + Converted_Angle + s(mon_utm_x, mon_utm_y, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          -2.598e+00  1.076e-01  -24.142  < 2e-16 ***
## year2023             -1.204e-01  6.381e-03  -18.862  < 2e-16 ***
## as.factor(weekday).L  1.056e-01  2.772e-03   38.089  < 2e-16 ***
## as.factor(weekday).Q -1.728e-01  2.779e-03  -62.180  < 2e-16 ***
## as.factor(weekday).C -6.411e-03  2.773e-03   -2.312   0.0208 *  
## as.factor(weekday)^4 -1.711e-02  2.783e-03   -6.146 7.94e-10 ***
## as.factor(weekday)^5 -5.627e-02  2.776e-03  -20.268  < 2e-16 ***
## as.factor(weekday)^6 -2.645e-02  2.788e-03   -9.486  < 2e-16 ***
## wd_secQ2             -1.503e-01  3.698e-03  -40.648  < 2e-16 ***
## wd_secQ3             -1.289e-01  3.923e-03  -32.865  < 2e-16 ***
## wd_secQ4             -7.449e-02  3.472e-03  -21.453  < 2e-16 ***
## ws                   -6.841e-02  4.074e-04 -167.932  < 2e-16 ***
## downwind_ref          1.374e-01  3.901e-03   35.229  < 2e-16 ***
## downwind_wrp         -1.702e-02  4.304e-03   -3.954 7.69e-05 ***
## I(1/MinDist^2)        2.788e-05  1.646e-06   16.943  < 2e-16 ***
## dist_wrp              3.799e-04  8.572e-06   44.322  < 2e-16 ***
## capacity              6.154e-03  8.181e-05   75.220  < 2e-16 ***
## Converted_Angle      -2.943e-03  1.099e-04  -26.770  < 2e-16 ***
## monthly_oil_1km       6.223e-05  2.765e-06   22.506  < 2e-16 ***
## monthly_gas_1km       1.292e-04  1.243e-05   10.390  < 2e-16 ***
## active_1km           -2.572e-02  1.946e-03  -13.218  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf Ref.df    F p-value    
## s(as.numeric(month))   7.989  8.000 2000  <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.008  8.008 5964  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 36/38
## R-sq.(adj) =  0.175   Deviance explained = 17.6%
## GCV = 0.84667  Scale est. = 0.84663   n = 772251
plot(h2s_model_d)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4643484  248.0    8310081  443.9   7027379  375.4
## Vcells 143170798 1092.4  470281716 3588.0 391694254 2988.4
# Include elvation, EVI, 3D smooth, odor, dist to dom channel, temp, hum, precip
h2s_model_e <- readRDS('rfiles/h2s_model_e.rds')
# h2s_model_e <- gam(H2S ~ 
#                      s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
#                      wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
#                      , 
#                    data = full_data %>% filter(day >= '2022-02-01'))
#saveRDS(h2s_model_e, 'rfiles/h2s_model_e.rds')
summary(h2s_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.285e+00  6.510e-01    3.510 0.000449 ***
## year2023              9.870e-02  1.712e-02    5.766 8.12e-09 ***
## as.factor(weekday).L  1.129e-01  2.852e-03   39.594  < 2e-16 ***
## as.factor(weekday).Q -1.747e-01  2.841e-03  -61.484  < 2e-16 ***
## as.factor(weekday).C  1.584e-02  2.824e-03    5.609 2.03e-08 ***
## as.factor(weekday)^4 -1.810e-02  2.828e-03   -6.400 1.55e-10 ***
## as.factor(weekday)^5 -5.583e-02  2.808e-03  -19.883  < 2e-16 ***
## as.factor(weekday)^6 -2.485e-02  2.822e-03   -8.806  < 2e-16 ***
## wd_secQ2             -1.031e-01  3.741e-03  -27.569  < 2e-16 ***
## wd_secQ3             -1.454e-01  3.962e-03  -36.702  < 2e-16 ***
## wd_secQ4             -1.146e-01  3.490e-03  -32.841  < 2e-16 ***
## ws                   -6.036e-02  4.158e-04 -145.169  < 2e-16 ***
## downwind_ref          1.098e-01  3.923e-03   28.001  < 2e-16 ***
## downwind_wrp          8.079e-03  4.326e-03    1.868 0.061802 .  
## I(1/MinDist^2)        1.707e-05  6.264e-06    2.726 0.006420 ** 
## I(1/dist_wrp^2)       1.056e-06  3.794e-07    2.783 0.005394 ** 
## capacity              7.434e-03  1.338e-03    5.556 2.75e-08 ***
## wrp_angle             4.063e-03  9.670e-04    4.201 2.65e-05 ***
## Converted_Angle      -1.586e-02  2.157e-03   -7.352 1.95e-13 ***
## elevation            -2.059e-01  1.293e-02  -15.920  < 2e-16 ***
## EVI                   2.975e+00  6.113e-01    4.868 1.13e-06 ***
## monthly_oil_1km       1.856e-04  6.710e-06   27.663  < 2e-16 ***
## monthly_gas_1km      -4.097e-04  3.277e-05  -12.504  < 2e-16 ***
## active_1km           -3.057e-02  2.530e-03  -12.081  < 2e-16 ***
## num_odor_complaints   2.821e-02  1.917e-03   14.718  < 2e-16 ***
## I(1/dist_dc^2)        1.019e-04  9.015e-06   11.303  < 2e-16 ***
## avg_temp              9.080e-03  4.090e-04   22.202  < 2e-16 ***
## avg_hum              -7.505e-03  1.056e-04  -71.073  < 2e-16 ***
## precip               -8.285e-02  5.145e-03  -16.102  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df       F
## s(as.numeric(month))                                     7.994  8.000 310.126
## s(mon_utm_x,mon_utm_y)                                   1.865  1.865   0.459
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 81.249 81.253 899.338
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(mon_utm_x,mon_utm_y)                                    0.542    
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/135
## R-sq.(adj) =  0.255   Deviance explained = 25.5%
## GCV = 0.80029  Scale est. = 0.80016   n = 712259
plot(h2s_model_e)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4651247  248.5    8310081  443.9   7027379  375.4
## Vcells 181301887 1383.3  470281716 3588.0 391694254 2988.4
# Model only the month of the event, October 2021
h2s_model_f <- readRDS('rfiles/h2s_model_f.rds')
# h2s_model_f <- gam(H2S ~ wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
#                    data = full_data %>% filter(year == '2021', month == '10'))
# saveRDS(h2s_model_f, 'rfiles/h2s_model_f.rds')
summary(h2s_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.099e-07  4.542e-07   2.003  0.04516 *  
## wd_secQ2             3.129e+00  9.512e-01   3.289  0.00101 ** 
## wd_secQ3             6.611e+00  9.439e-01   7.004 2.50e-12 ***
## wd_secQ4             7.882e+00  8.910e-01   8.845  < 2e-16 ***
## ws                  -2.321e+00  1.159e-01 -20.026  < 2e-16 ***
## downwind_ref         1.639e+00  8.589e-01   1.908  0.05642 .  
## downwind_wrp         1.494e+00  1.077e+00   1.388  0.16529    
## I(1/MinDist^2)      -4.561e-03  1.475e-03  -3.093  0.00198 ** 
## I(1/dist_wrp^2)     -9.325e-04  5.499e-05 -16.959  < 2e-16 ***
## capacity             7.671e-01  1.692e-01   4.533 5.82e-06 ***
## wrp_angle            8.449e-01  1.825e-01   4.629 3.68e-06 ***
## Converted_Angle     -2.556e+00  3.095e-01  -8.258  < 2e-16 ***
## elevation           -2.652e+01  2.679e+00  -9.899  < 2e-16 ***
## EVI                  8.337e+02  1.155e+02   7.220 5.23e-13 ***
## monthly_oil_1km      1.533e-02  7.959e-03   1.926  0.05412 .  
## monthly_gas_1km      2.295e-03  1.205e-03   1.905  0.05680 .  
## active_1km           3.981e-05  2.000e-05   1.991  0.04647 *  
## num_odor_complaints  8.458e-01  6.767e-02  12.498  < 2e-16 ***
## I(1/dist_dc^2)       7.748e+00  3.991e-01  19.415  < 2e-16 ***
## avg_temp             1.349e+00  1.864e-01   7.236 4.66e-13 ***
## avg_hum              1.629e-01  5.258e-02   3.099  0.00194 ** 
## precip              -4.019e+01  7.628e+00  -5.269 1.37e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(mon_utm_x,mon_utm_y)                                   2.017   2.02  4.625
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 84.337  84.58 61.960
##                                                         p-value    
## s(mon_utm_x,mon_utm_y)                                  0.00868 ** 
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 108/120
## R-sq.(adj) =  0.265   Deviance explained = 26.5%
## GCV = 5448.6  Scale est. = 5441.4    n = 77510
plot(h2s_model_f)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4652770  248.5    8310081  443.9   7027379  375.4
## Vcells 185288289 1413.7  470281716 3588.0 391694254 2988.4
# Model data excluding the months of the event
h2s_model_g <- readRDS('rfiles/h2s_model_g.rds')
# h2s_model_g <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
#                      wd_sec + ws + downwind_ref + downwind_wrp +
#                      I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
#                      Converted_Angle + elevation + EVI + 
#                      s(mon_utm_x, mon_utm_y, bs='tp', k = 10) + 
#                      te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                      monthly_oil_1km + monthly_gas_1km + active_1km +
#                      num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
#                      , 
#                    data = full_data %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
# saveRDS(h2s_model_g, 'rfiles/h2s_model_g.rds')
summary(h2s_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle + 
##     elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           5.539e+00  3.610e-01   15.343  < 2e-16 ***
## year2021              3.707e-01  1.375e-02   26.967  < 2e-16 ***
## year2022             -5.225e-01  2.573e-02  -20.307  < 2e-16 ***
## year2023             -6.492e-01  2.547e-02  -25.492  < 2e-16 ***
## as.factor(weekday).L  8.394e-02  2.184e-03   38.426  < 2e-16 ***
## as.factor(weekday).Q -1.356e-01  2.182e-03  -62.151  < 2e-16 ***
## as.factor(weekday).C  1.406e-03  2.184e-03    0.644   0.5197    
## as.factor(weekday)^4 -2.889e-02  2.188e-03  -13.204  < 2e-16 ***
## as.factor(weekday)^5 -1.732e-02  2.181e-03   -7.943 1.98e-15 ***
## as.factor(weekday)^6 -5.050e-03  2.184e-03   -2.312   0.0208 *  
## wd_secQ2             -1.049e-01  2.952e-03  -35.543  < 2e-16 ***
## wd_secQ3             -1.878e-01  2.958e-03  -63.463  < 2e-16 ***
## wd_secQ4             -1.651e-01  2.796e-03  -59.057  < 2e-16 ***
## ws                   -4.613e-02  3.094e-04 -149.116  < 2e-16 ***
## downwind_ref          3.540e-02  2.614e-03   13.545  < 2e-16 ***
## downwind_wrp          2.151e-02  3.270e-03    6.577 4.80e-11 ***
## I(1/MinDist^2)       -4.864e-05  8.792e-06   -5.533 3.15e-08 ***
## I(1/dist_wrp^2)       1.483e-06  9.456e-07    1.568   0.1168    
## capacity              5.414e-03  5.868e-04    9.225  < 2e-16 ***
## wrp_angle             3.176e-03  7.460e-04    4.258 2.06e-05 ***
## Converted_Angle      -1.473e-02  1.020e-03  -14.451  < 2e-16 ***
## elevation            -1.200e-01  9.824e-03  -12.215  < 2e-16 ***
## EVI                   2.459e+00  4.676e-01    5.259 1.44e-07 ***
## monthly_oil_1km      -2.575e-05  2.232e-06  -11.534  < 2e-16 ***
## monthly_gas_1km      -1.831e-04  9.137e-06  -20.036  < 2e-16 ***
## active_1km           -3.881e-02  9.430e-04  -41.155  < 2e-16 ***
## num_odor_complaints   2.632e-02  1.466e-03   17.949  < 2e-16 ***
## I(1/dist_dc^2)        2.766e-03  3.971e-03    0.696   0.4861    
## avg_temp              6.183e-03  2.696e-04   22.937  < 2e-16 ***
## avg_hum              -6.301e-03  7.620e-05  -82.688  < 2e-16 ***
## precip               -7.811e-02  5.252e-03  -14.872  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.999  8.000 2547.0
## s(mon_utm_x,mon_utm_y)                                   1.682  1.683  246.9
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 85.198 85.296 1455.0
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(mon_utm_x,mon_utm_y)                                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 129/137
## R-sq.(adj) =  0.166   Deviance explained = 16.6%
## GCV = 1.1485  Scale est. = 1.1484    n = 1696814
plot(h2s_model_g)

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4655246  248.7    8310081  443.9   7027379  375.4
## Vcells 275579936 2102.6  470281716 3588.0 391694254 2988.4

Daily max H2S

# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
                        I(1/MinDist^2), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg + 
##     ws_avg + I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.031e+00  3.477e-01   8.717   <2e-16 ***
## year2023       -4.346e-01  2.197e-01  -1.978    0.048 *  
## wd_avg          6.613e-04  1.035e-03   0.639    0.523    
## ws_avg         -8.840e-02  6.339e-02  -1.394    0.163    
## I(1/MinDist^2)  7.658e-07  8.232e-08   9.303   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F  p-value    
## s(as.numeric(month)) 2.184      8 2.026 6.47e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 12/13
## R-sq.(adj) =  0.00874   Deviance explained = 1.07%
## GCV = 21.457  Scale est. = 21.407    n = 2664
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp", 
##     k = 10)
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       9.418e+00  1.420e+01   0.663  0.50715    
## wd_avg                            3.025e-03  1.010e-03   2.994  0.00278 ** 
## ws_avg                           -3.111e-01  6.333e-02  -4.913 9.54e-07 ***
## I(1/MinDist^2)                   -6.123e-05  1.635e-03  -0.037  0.97012    
## RefineryMarathon (Carson)        -9.065e-01  1.761e+01  -0.051  0.95894    
## RefineryMarathon (Wilmington)    -3.737e+00  1.745e+01  -0.214  0.83047    
## RefineryPhillips 66 (Wilmington) -1.409e+01  1.626e+01  -0.866  0.38643    
## RefineryTorrance Refinery        -2.578e+00  1.400e+01  -0.184  0.85387    
## RefineryValero Refinery          -7.568e+00  1.766e+01  -0.429  0.66824    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F  p-value    
## s(as.numeric(month))       2.035  8.000 1.723 0.000203 ***
## s(monitor_lat,monitor_lon) 5.379  5.672 9.312 1.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 25/26
## R-sq.(adj) =  0.127   Deviance explained = 13.2%
## GCV = 18.968  Scale est. = 18.858    n = 2664
plot(h2s_dm_model_b)

# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Converted_Angle+
                        s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.608e+00  8.828e-01   4.087  4.5e-05 ***
## wd_avg           2.924e-03  1.015e-03   2.882 0.003983 ** 
## ws_avg          -2.235e-01  6.205e-02  -3.602 0.000322 ***
## I(1/MinDist^2)  -8.626e-06  3.814e-05  -0.226 0.821082    
## Converted_Angle -3.444e-03  3.892e-03  -0.885 0.376317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F  p-value    
## s(as.numeric(month))       2.113  8.000  2.042 4.88e-05 ***
## s(monitor_lat,monitor_lon) 7.071  7.919 41.137  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 21/22
## R-sq.(adj) =  0.117   Deviance explained = 12.1%
## GCV = 19.169  Scale est. = 19.074    n = 2664
plot(h2s_dm_model_c)

h2s_dm_model_d <- gam(H2S_daily_max ~ 
                         s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + dist_wrp + capacity +
                         Converted_Angle + 
                         s(monitor_lat, monitor_lon, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km
                      , 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + dist_wrp + 
##     capacity + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.696e+01  5.143e+00  -3.298 0.000986 ***
## year2023             -6.419e-01  4.082e-01  -1.573 0.115927    
## as.factor(weekday).L  4.659e-01  2.216e-01   2.103 0.035589 *  
## as.factor(weekday).Q -9.062e-01  2.223e-01  -4.077  4.7e-05 ***
## as.factor(weekday).C  8.727e-03  2.215e-01   0.039 0.968577    
## as.factor(weekday)^4 -1.728e-01  2.223e-01  -0.777 0.437033    
## as.factor(weekday)^5 -2.848e-01  2.217e-01  -1.285 0.198943    
## as.factor(weekday)^6 -1.696e-01  2.223e-01  -0.763 0.445474    
## wd_avg                3.371e-03  1.028e-03   3.279 0.001057 ** 
## ws_avg               -3.041e-01  6.608e-02  -4.601  4.4e-06 ***
## daily_downwind_ref    7.377e-01  3.989e-01   1.849 0.064527 .  
## I(1/MinDist^2)        2.246e-04  3.883e-04   0.578 0.563013    
## dist_wrp              2.258e-03  6.614e-04   3.415 0.000648 ***
## capacity              9.466e-03  1.166e-02   0.812 0.416776    
## Converted_Angle       9.685e-03  8.699e-03   1.113 0.265657    
## monthly_oil_1km       2.479e-04  1.548e-04   1.602 0.109297    
## monthly_gas_1km      -1.138e-03  6.554e-04  -1.736 0.082603 .  
## active_1km            3.974e-02  6.286e-02   0.632 0.527264    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F  p-value    
## s(as.numeric(month))       2.319  8.000  2.087 6.51e-05 ***
## s(monitor_lat,monitor_lon) 7.785  7.976 14.036  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 34/35
## R-sq.(adj) =  0.137   Deviance explained = 14.5%
## GCV = 18.829  Scale est. = 18.638    n = 2664
plot(h2s_dm_model_d)

# Try to include as many variables as possible
h2s_dm_model_e <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp + 
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints + 
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(day >= '2022-02-01'))

summary(h2s_dm_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km + 
##     elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.184e+01  7.409e+00  -1.598  0.11020    
## year2023                  2.174e-01  1.024e+00   0.212  0.83183    
## as.character(weekday)Mon -6.411e-01  3.101e-01  -2.067  0.03882 *  
## as.character(weekday)Sat -7.266e-01  3.099e-01  -2.344  0.01913 *  
## as.character(weekday)Sun -1.279e+00  3.102e-01  -4.123 3.86e-05 ***
## as.character(weekday)Thu -2.864e-01  3.102e-01  -0.923  0.35608    
## as.character(weekday)Tue -1.762e-01  3.078e-01  -0.572  0.56720    
## as.character(weekday)Wed -4.809e-02  3.118e-01  -0.154  0.87745    
## wd_avg                    2.344e-03  1.084e-03   2.161  0.03078 *  
## ws_avg                   -2.294e-01  7.129e-02  -3.217  0.00131 ** 
## daily_downwind_ref        4.051e-01  3.944e-01   1.027  0.30441    
## I(1/dist_wrp^2)           4.422e-07  1.655e-06   0.267  0.78930    
## capacity                 -5.974e-03  2.648e-03  -2.256  0.02415 *  
## daily_downwind_wrp        1.783e-01  4.040e-01   0.441  0.65903    
## monthly_oil_1km           5.216e-04  2.741e-04   1.903  0.05714 .  
## monthly_gas_1km          -1.588e-03  1.987e-03  -0.799  0.42419    
## active_1km                2.959e-01  1.237e-01   2.393  0.01680 *  
## elevation                -1.318e-01  5.729e-02  -2.301  0.02146 *  
## EVI                      -3.691e+00  7.407e-01  -4.983 6.68e-07 ***
## num_odor_complaints       1.128e-01  1.520e-01   0.742  0.45800    
## I(1/dist_dc^2)            3.870e-05  7.395e-05   0.523  0.60085    
## avg_temp                  6.252e-02  2.837e-02   2.204  0.02763 *  
## avg_hum                  -1.857e-02  7.398e-03  -2.511  0.01212 *  
## precip                    4.781e-01  4.334e-01   1.103  0.27009    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    3.379e-09  8.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  4.037e+00  4.451 3.041
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 5.550e+01 76.000 1.469
##                                                          p-value    
## s(as.numeric(month))                                     0.85254    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   0.00945 ** 
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 3.23e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 115/117
## R-sq.(adj) =  0.161   Deviance explained = 18.7%
## GCV = 18.681  Scale est. = 18.109    n = 2664
plot(h2s_dm_model_e)

e <- getViz(h2s_dm_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_f <- gam(H2S_daily_max ~ as.character(weekday) +
                        wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp + 
                        s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))

summary(h2s_dm_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref + 
##     I(1/dist_wrp^2) + capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -2.933615   1.112132  -2.638  0.00852 ** 
## as.character(weekday)Mon   3.939302  25.809815   0.153  0.87873    
## as.character(weekday)Sat  -1.274518  24.967489  -0.051  0.95930    
## as.character(weekday)Sun   6.518458  25.156102   0.259  0.79561    
## as.character(weekday)Thu  10.719203  25.476323   0.421  0.67406    
## as.character(weekday)Tue -12.039497  26.266684  -0.458  0.64683    
## as.character(weekday)Wed   9.587227  25.907187   0.370  0.71144    
## wd_avg                     0.079809   0.082575   0.966  0.33411    
## ws_avg                     0.102886   6.616534   0.016  0.98760    
## daily_downwind_ref       -30.773112  24.395106  -1.261  0.20754    
## I(1/dist_wrp^2)           -0.001388   0.000147  -9.442  < 2e-16 ***
## capacity                   6.856419   0.710540   9.650  < 2e-16 ***
## daily_downwind_wrp         8.299696  29.553392   0.281  0.77891    
## monthly_oil_1km            0.053059   0.115651   0.459  0.64652    
## monthly_gas_1km           -0.001199   0.314860  -0.004  0.99696    
## active_1km               -96.629305  36.632200  -2.638  0.00852 ** 
## elevation                 24.515412   7.897485   3.104  0.00198 ** 
## EVI                      804.034316 104.569368   7.689 4.69e-14 ***
## num_odor_complaints        3.663308   0.869852   4.211 2.85e-05 ***
## I(1/dist_dc^2)             9.401428   0.971411   9.678  < 2e-16 ***
## avg_temp                   2.543734   2.454075   1.037  0.30029    
## avg_hum                    0.217984   0.586008   0.372  0.71001    
## precip                    -6.489537  26.367207  -0.246  0.80566    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.929   8.99 12.175
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.052  80.00  4.366
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 109/112
## R-sq.(adj) =  0.511   Deviance explained = 55.8%
## GCV =  37085  Scale est. = 33454     n = 827
plot(h2s_dm_model_f)

f <- getViz(h2s_dm_model_f)
plot(sm(f, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_g <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                        wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                        capacity + daily_downwind_wrp +
                        s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                        monthly_oil_1km + monthly_gas_1km + active_1km + 
                        elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))

summary(h2s_dm_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp", 
##         "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km + 
##     elevation + EVI + num_odor_complaints + I(1/dist_dc^2) + 
##     avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.916e+00  5.515e+00   0.529 0.597053    
## year2021                  8.579e-01  9.300e-01   0.922 0.356332    
## year2022                 -2.466e-01  1.422e+00  -0.173 0.862369    
## year2023                 -5.194e-01  1.452e+00  -0.358 0.720584    
## as.character(weekday)Mon -3.061e-01  2.768e-01  -1.106 0.268828    
## as.character(weekday)Sat -5.006e-01  2.749e-01  -1.821 0.068640 .  
## as.character(weekday)Sun -8.683e-01  2.753e-01  -3.153 0.001622 ** 
## as.character(weekday)Thu -4.797e-02  2.758e-01  -0.174 0.861931    
## as.character(weekday)Tue  2.499e-01  2.750e-01   0.909 0.363486    
## as.character(weekday)Wed  4.445e-02  2.770e-01   0.160 0.872503    
## wd_avg                    1.333e-03  1.003e-03   1.329 0.183975    
## ws_avg                   -2.430e-01  6.243e-02  -3.893 0.000100 ***
## daily_downwind_ref        1.440e-01  2.800e-01   0.514 0.607101    
## I(1/dist_wrp^2)          -3.044e-06  6.687e-07  -4.553 5.40e-06 ***
## capacity                  3.387e-04  6.443e-03   0.053 0.958080    
## daily_downwind_wrp       -5.676e-03  3.347e-01  -0.017 0.986471    
## monthly_oil_1km           9.574e-05  1.675e-04   0.572 0.567570    
## monthly_gas_1km           1.013e-04  6.476e-04   0.156 0.875724    
## active_1km                1.156e-01  5.775e-02   2.002 0.045331 *  
## elevation                -2.426e-01  6.877e-02  -3.528 0.000421 ***
## EVI                      -4.221e+00  1.023e+00  -4.125 3.76e-05 ***
## num_odor_complaints       3.987e-01  1.295e-01   3.079 0.002084 ** 
## I(1/dist_dc^2)            1.558e-03  1.652e-03   0.943 0.345664    
## avg_temp                  1.508e-03  2.045e-02   0.074 0.941227    
## avg_hum                  -2.471e-02  6.150e-03  -4.019 5.93e-05 ***
## precip                    4.173e-01  5.063e-01   0.824 0.409849    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                               edf Ref.df     F
## s(as.numeric(month))                                    1.256e-08  8.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  7.759e+00  8.001 4.529
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 4.023e+01 80.000 1.062
##                                                          p-value    
## s(as.numeric(month))                                       0.517    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  1.53e-05 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 9.00e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.0679   Deviance explained = 7.87%
## GCV = 33.746  Scale est. = 33.352    n = 6162
plot(h2s_dm_model_g)

g <- getViz(h2s_dm_model_g)
plot(sm(g, 2)) + l_fitRaster() + l_fitContour() + l_points()

Daily Average

# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(monitor_lon, monitor_lat, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(monitor_lon, monitor_lat, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4.742e+00  9.324e-01  -5.086 3.91e-07 ***
## year2023             -1.515e-01  5.680e-02  -2.667 0.007704 ** 
## as.factor(weekday).L  8.212e-02  2.360e-02   3.480 0.000509 ***
## as.factor(weekday).Q -1.599e-01  2.367e-02  -6.755 1.75e-11 ***
## as.factor(weekday).C  1.925e-02  2.357e-02   0.817 0.414075    
## as.factor(weekday)^4 -6.537e-03  2.364e-02  -0.277 0.782124    
## as.factor(weekday)^5 -4.980e-02  2.357e-02  -2.113 0.034702 *  
## as.factor(weekday)^6 -4.449e-02  2.363e-02  -1.883 0.059767 .  
## wd_avg                7.982e-04  1.096e-04   7.286 4.19e-13 ***
## ws_avg               -1.139e-01  7.202e-03 -15.809  < 2e-16 ***
## daily_downwind_ref    2.721e-01  4.253e-02   6.399 1.85e-10 ***
## I(1/MinDist^2)        2.940e-04  3.325e-05   8.844  < 2e-16 ***
## I(1/dist_wrp^2)      -1.625e-05  2.391e-06  -6.798 1.31e-11 ***
## capacity              1.565e-02  1.127e-03  13.885  < 2e-16 ***
## Converted_Angle      -2.730e-03  9.770e-04  -2.795 0.005234 ** 
## monthly_oil_1km       6.935e-05  2.246e-05   3.087 0.002043 ** 
## monthly_gas_1km       4.867e-05  1.016e-04   0.479 0.631782    
## active_1km           -2.583e-02  1.482e-02  -1.743 0.081492 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F p-value    
## s(as.numeric(month))       7.557      8 19.58  <2e-16 ***
## s(monitor_lon,monitor_lat) 8.988      9 85.28  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =  0.421   Deviance explained = 42.8%
## GCV = 0.21298  Scale est. = 0.21037   n = 2664
plot(h2s_da_model_a)

a <- getViz(h2s_da_model_a)
plot(sm(a, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Look at daily average
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.291e+00  8.757e-01  -3.758 0.000175 ***
## year2023             -1.515e-01  5.681e-02  -2.667 0.007708 ** 
## as.factor(weekday).L  8.212e-02  2.360e-02   3.480 0.000509 ***
## as.factor(weekday).Q -1.599e-01  2.367e-02  -6.755 1.76e-11 ***
## as.factor(weekday).C  1.925e-02  2.357e-02   0.817 0.414145    
## as.factor(weekday)^4 -6.549e-03  2.364e-02  -0.277 0.781748    
## as.factor(weekday)^5 -4.978e-02  2.357e-02  -2.112 0.034749 *  
## as.factor(weekday)^6 -4.449e-02  2.363e-02  -1.883 0.059817 .  
## wd_avg                7.982e-04  1.095e-04   7.286 4.20e-13 ***
## ws_avg               -1.138e-01  7.202e-03 -15.803  < 2e-16 ***
## daily_downwind_ref    2.721e-01  4.252e-02   6.400 1.84e-10 ***
## I(1/MinDist^2)        1.320e-04  1.904e-05   6.931 5.25e-12 ***
## I(1/dist_wrp^2)      -9.831e-06  1.268e-06  -7.752 1.29e-14 ***
## capacity              1.202e-02  8.420e-04  14.280  < 2e-16 ***
## Converted_Angle      -2.608e-03  9.596e-04  -2.718 0.006616 ** 
## monthly_oil_1km       6.932e-05  2.247e-05   3.086 0.002053 ** 
## monthly_gas_1km       4.888e-05  1.016e-04   0.481 0.630348    
## active_1km           -2.584e-02  1.482e-02  -1.743 0.081413 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.559      8 19.58  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.981      9 85.03  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =  0.421   Deviance explained = 42.8%
## GCV = 0.21297  Scale est. = 0.21037   n = 2664
plot(h2s_da_model_b)

b <- getViz(h2s_da_model_b)
plot(sm(b, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_c <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4.092e+00  8.277e-01  -4.944 8.13e-07 ***
## year2023             -1.502e-01  5.687e-02  -2.642 0.008293 ** 
## as.factor(weekday).L  8.210e-02  2.362e-02   3.475 0.000518 ***
## as.factor(weekday).Q -1.601e-01  2.370e-02  -6.757 1.73e-11 ***
## as.factor(weekday).C  1.946e-02  2.359e-02   0.825 0.409569    
## as.factor(weekday)^4 -6.335e-03  2.366e-02  -0.268 0.788954    
## as.factor(weekday)^5 -5.012e-02  2.359e-02  -2.124 0.033761 *  
## as.factor(weekday)^6 -4.475e-02  2.365e-02  -1.892 0.058579 .  
## wd_avg                7.928e-04  1.097e-04   7.230 6.31e-13 ***
## ws_avg               -1.152e-01  7.196e-03 -16.005  < 2e-16 ***
## daily_downwind_ref    2.708e-01  4.257e-02   6.362 2.35e-10 ***
## capacity              1.264e-02  8.169e-04  15.474  < 2e-16 ***
## I(1/dist_wrp^2)      -1.222e-05  1.314e-06  -9.304  < 2e-16 ***
## monthly_oil_1km       6.987e-05  2.249e-05   3.107 0.001913 ** 
## monthly_gas_1km       4.838e-05  1.017e-04   0.476 0.634221    
## active_1km           -2.578e-02  1.484e-02  -1.737 0.082429 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.558      8 19.47  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.992      9 90.61  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 32/33
## R-sq.(adj) =   0.42   Deviance explained = 42.6%
## GCV = 0.21339  Scale est. = 0.21086   n = 2664
plot(h2s_da_model_c)

c <- getViz(h2s_da_model_c)
plot(sm(c, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_d <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km + daily_downwind_wrp + 
##     elevation + EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.080e+00  1.012e+00   1.068  0.28574    
## year2023                 -1.517e-01  5.678e-02  -2.671  0.00760 ** 
## as.character(weekday)Mon -9.035e-02  3.341e-02  -2.705  0.00688 ** 
## as.character(weekday)Sat -9.858e-02  3.336e-02  -2.955  0.00315 ** 
## as.character(weekday)Sun -1.974e-01  3.323e-02  -5.940 3.23e-09 ***
## as.character(weekday)Thu -4.795e-02  3.340e-02  -1.436  0.15121    
## as.character(weekday)Tue -8.359e-03  3.317e-02  -0.252  0.80104    
## as.character(weekday)Wed  3.880e-02  3.354e-02   1.157  0.24744    
## wd_avg                    7.977e-04  1.095e-04   7.283 4.30e-13 ***
## ws_avg                   -1.143e-01  7.206e-03 -15.858  < 2e-16 ***
## daily_downwind_ref        2.737e-01  4.248e-02   6.444 1.38e-10 ***
## capacity                  1.097e-03  1.436e-03   0.764  0.44488    
## I(1/dist_wrp^2)          -8.660e-08  5.329e-08  -1.625  0.10426    
## monthly_oil_1km           6.917e-05  2.246e-05   3.081  0.00209 ** 
## monthly_gas_1km           4.965e-05  1.015e-04   0.489  0.62483    
## active_1km               -2.562e-02  1.481e-02  -1.730  0.08375 .  
## daily_downwind_wrp        5.139e-02  4.297e-02   1.196  0.23182    
## elevation                -1.412e-02  8.007e-03  -1.764  0.07790 .  
## EVI                      -1.067e+00  1.446e-01  -7.374 2.21e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df     F p-value    
## s(as.numeric(month))                   7.553  8.000 19.57  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.916  7.997 51.72  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 35/36
## R-sq.(adj) =  0.421   Deviance explained = 42.8%
## GCV = 0.21302  Scale est. = 0.21034   n = 2664
plot(h2s_da_model_d)

d <- getViz(h2s_da_model_d)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_e <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.705e+00  7.312e-01  -5.067 4.33e-07 ***
## year2023                  3.857e-01  1.227e-01   3.144 0.001688 ** 
## as.character(weekday)Mon -9.050e-02  2.665e-02  -3.396 0.000695 ***
## as.character(weekday)Sat -9.599e-02  2.661e-02  -3.607 0.000316 ***
## as.character(weekday)Sun -1.993e-01  2.653e-02  -7.513 7.97e-14 ***
## as.character(weekday)Thu -4.584e-02  2.663e-02  -1.721 0.085311 .  
## as.character(weekday)Tue -1.143e-02  2.646e-02  -0.432 0.665634    
## as.character(weekday)Wed  3.472e-02  2.675e-02   1.298 0.194507    
## wd_avg                    6.304e-04  9.123e-05   6.910 6.10e-12 ***
## ws_avg                   -1.064e-01  6.042e-03 -17.612  < 2e-16 ***
## daily_downwind_ref        2.073e-01  3.439e-02   6.029 1.89e-09 ***
## capacity                  1.147e-02  1.482e-03   7.741 1.41e-14 ***
## I(1/dist_wrp^2)           1.399e-07  1.185e-07   1.181 0.237685    
## monthly_oil_1km           1.385e-04  4.413e-05   3.138 0.001719 ** 
## monthly_gas_1km           3.211e-04  2.507e-04   1.281 0.200372    
## active_1km               -4.537e-02  1.792e-02  -2.532 0.011400 *  
## daily_downwind_wrp        5.485e-02  3.489e-02   1.572 0.116065    
## elevation                -2.933e-02  1.071e-02  -2.738 0.006223 ** 
## EVI                      -6.902e-01  1.770e-01  -3.900 9.86e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.841  8.000 12.37
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.471  8.471 21.76
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.879 76.000 20.86
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 110/112
## R-sq.(adj) =  0.633   Deviance explained = 64.8%
## GCV = 0.13916  Scale est. = 0.13343   n = 2664
plot(h2s_da_model_e)

e <- getViz(h2s_da_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Since feb 2022
h2s_da_model_f <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_f)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -4.696e+00  7.111e-01  -6.603 4.88e-11 ***
## year2023                  4.435e-01  1.232e-01   3.600 0.000324 ***
## as.character(weekday)Mon -8.809e-02  2.550e-02  -3.454 0.000561 ***
## as.character(weekday)Sat -9.022e-02  2.540e-02  -3.552 0.000389 ***
## as.character(weekday)Sun -2.061e-01  2.542e-02  -8.108 7.92e-16 ***
## as.character(weekday)Thu -3.686e-02  2.542e-02  -1.450 0.147173    
## as.character(weekday)Tue -8.548e-03  2.527e-02  -0.338 0.735230    
## as.character(weekday)Wed  2.998e-02  2.555e-02   1.173 0.240810    
## wd_avg                    3.508e-04  9.030e-05   3.885 0.000105 ***
## ws_avg                   -7.855e-02  6.026e-03 -13.035  < 2e-16 ***
## daily_downwind_ref        1.567e-01  3.302e-02   4.744 2.21e-06 ***
## capacity                  1.003e-02  1.427e-03   7.031 2.63e-12 ***
## I(1/dist_wrp^2)           2.122e-07  1.133e-07   1.873 0.061128 .  
## monthly_oil_1km           1.611e-04  4.233e-05   3.805 0.000145 ***
## monthly_gas_1km          -1.306e-04  2.429e-04  -0.538 0.590730    
## active_1km               -9.257e-03  1.728e-02  -0.536 0.592319    
## daily_downwind_wrp        6.698e-02  3.329e-02   2.012 0.044311 *  
## elevation                -2.960e-02  1.022e-02  -2.897 0.003804 ** 
## EVI                      -7.131e-01  1.698e-01  -4.200 2.76e-05 ***
## num_odor_complaints       2.552e-02  1.252e-02   2.038 0.041693 *  
## I(1/dist_dc^2)            2.080e-05  9.122e-06   2.281 0.022659 *  
## avg_temp                  1.206e-02  2.685e-03   4.491 7.42e-06 ***
## avg_hum                  -5.599e-03  7.027e-04  -7.968 2.42e-15 ***
## precip                   -7.076e-02  3.637e-02  -1.945 0.051851 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df     F
## s(as.numeric(month))                                     7.854  8.000 13.30
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.471  8.471 23.66
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.878 76.000 23.14
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 114/117
## R-sq.(adj) =  0.666   Deviance explained =   68%
## GCV = 0.12669  Scale est. = 0.12128   n = 2664
plot(h2s_da_model_f)

f <- getViz(h2s_da_model_f)
plot(sm(f, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
# 
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Disaster only
h2s_da_model_g <- gam(H2S_daily_avg ~ month + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))
summary(h2s_da_model_g)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ month + as.character(weekday) + wd_avg + ws_avg + 
##     daily_downwind_ref + capacity + I(1/dist_wrp^2) + s(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3), 
##     I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2, 
##         1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints + 
##     I(1/dist_dc^2) + avg_temp + avg_hum + precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.2213924  0.0934139  -2.370  0.01804 *  
## month11                  -4.2142854  1.7782277  -2.370  0.01804 *  
## month12                   1.1011938  0.4647034   2.370  0.01806 *  
## as.character(weekday)Mon -2.7259172  2.9255211  -0.932  0.35176    
## as.character(weekday)Sat -2.8891542  2.8298781  -1.021  0.30761    
## as.character(weekday)Sun -2.4956249  2.8514127  -0.875  0.38173    
## as.character(weekday)Thu -0.3339398  2.8872365  -0.116  0.90795    
## as.character(weekday)Tue -4.5461381  2.9772550  -1.527  0.12720    
## as.character(weekday)Wed -1.2618609  2.9362292  -0.430  0.66750    
## wd_avg                    0.0080694  0.0093628   0.862  0.38904    
## ws_avg                   -0.0079586  0.7510760  -0.011  0.99155    
## daily_downwind_ref       -2.7311456  2.7658732  -0.987  0.32375    
## capacity                  0.7818623  0.0804573   9.718  < 2e-16 ***
## I(1/dist_wrp^2)          -0.0001603  0.0000167  -9.600  < 2e-16 ***
## monthly_oil_1km          -0.0058962  0.0097159  -0.607  0.54413    
## monthly_gas_1km           0.0147016  0.0338346   0.435  0.66404    
## active_1km               -7.2923513  3.0769295  -2.370  0.01804 *  
## daily_downwind_wrp       -0.8402903  3.3511018  -0.251  0.80208    
## elevation                 2.7201937  0.8943859   3.041  0.00244 ** 
## EVI                      90.9526953 11.8445995   7.679 5.05e-14 ***
## num_odor_complaints       0.5226646  0.0988284   5.289 1.62e-07 ***
## I(1/dist_dc^2)            1.0786879  0.1100253   9.804  < 2e-16 ***
## avg_temp                  0.3827439  0.2785720   1.374  0.16987    
## avg_hum                   0.0272100  0.0665065   0.409  0.68256    
## precip                   -2.5913331  2.9921008  -0.866  0.38674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.926  8.989 12.392
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.823 80.000  3.689
##                                                         p-value    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 109/114
## R-sq.(adj) =  0.501   Deviance explained =   55%
## GCV = 476.76  Scale est. = 429.63    n = 827
plot(h2s_da_model_g)

g <- getViz(h2s_da_model_g)
plot(sm(g, 1)) + l_fitRaster() + l_fitContour() + l_points()

# Exclude disaster
h2s_da_model_h <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
summary(h2s_da_model_h)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -4.523e+00  1.295e+00  -3.493 0.000482 ***
## year2021                  3.333e-01  9.799e-02   3.402 0.000674 ***
## year2022                 -5.309e-01  2.081e-01  -2.551 0.010766 *  
## year2023                 -6.752e-01  2.155e-01  -3.133 0.001738 ** 
## as.character(weekday)Mon -5.458e-02  2.341e-02  -2.331 0.019771 *  
## as.character(weekday)Sat -8.351e-02  2.322e-02  -3.596 0.000325 ***
## as.character(weekday)Sun -1.702e-01  2.328e-02  -7.313 2.96e-13 ***
## as.character(weekday)Thu -1.154e-02  2.329e-02  -0.496 0.620154    
## as.character(weekday)Tue -1.368e-02  2.326e-02  -0.588 0.556374    
## as.character(weekday)Wed  1.616e-02  2.341e-02   0.690 0.490096    
## wd_avg                    1.512e-04  8.622e-05   1.754 0.079466 .  
## ws_avg                   -7.304e-02  5.490e-03 -13.304  < 2e-16 ***
## daily_downwind_ref        2.274e-02  2.396e-02   0.949 0.342705    
## capacity                  1.873e-02  2.376e-03   7.884 3.73e-15 ***
## I(1/dist_wrp^2)          -4.082e-06  1.369e-06  -2.983 0.002870 ** 
## monthly_oil_1km          -4.735e-05  1.804e-05  -2.625 0.008694 ** 
## monthly_gas_1km          -2.152e-04  7.046e-05  -3.054 0.002271 ** 
## active_1km               -4.449e-02  7.093e-03  -6.273 3.80e-10 ***
## daily_downwind_wrp        2.858e-02  2.842e-02   1.006 0.314673    
## elevation                 7.088e-02  1.153e-02   6.149 8.28e-10 ***
## EVI                       9.990e-01  2.838e-01   3.520 0.000435 ***
## num_odor_complaints       2.815e-02  1.119e-02   2.516 0.011891 *  
## I(1/dist_dc^2)            3.415e-02  8.835e-03   3.865 0.000112 ***
## avg_temp                  5.698e-03  2.075e-03   2.747 0.006037 ** 
## avg_hum                  -6.004e-03  5.830e-04 -10.299  < 2e-16 ***
## precip                   -3.164e-02  4.322e-02  -0.732 0.464151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df     F
## s(as.numeric(month))                                     7.92      8 44.56
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.00      9 24.04
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 74.52     80 23.17
##                                                         p-value    
## s(as.numeric(month))                                     <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.462   Deviance explained = 47.2%
## GCV = 0.24227  Scale est. = 0.23773   n = 6162
plot(h2s_da_model_h)

h <- getViz(h2s_da_model_h)
plot(sm(h, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Disaster indicator
h2s_da_model_i <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip + disaster, 
                      data = daily_full %>% mutate(disaster = if_else(year == '2021', 
                                                                      month %in% c('10', '11', '12'), 1, 0)))
summary(h2s_da_model_i)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip + disaster
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.573e+02  1.399e+01 -25.530  < 2e-16 ***
## year2021                  3.989e+00  1.666e+00   2.394  0.01667 *  
## year2022                  5.441e+00  2.075e+00   2.622  0.00875 ** 
## year2023                  4.519e+00  2.475e+00   1.826  0.06786 .  
## as.character(weekday)Mon -3.258e-01  3.764e-01  -0.866  0.38676    
## as.character(weekday)Sat -2.928e-01  3.732e-01  -0.785  0.43272    
## as.character(weekday)Sun -3.098e-01  3.734e-01  -0.830  0.40675    
## as.character(weekday)Thu -2.927e-01  3.745e-01  -0.782  0.43448    
## as.character(weekday)Tue -6.662e-01  3.743e-01  -1.780  0.07516 .  
## as.character(weekday)Wed -3.629e-01  3.757e-01  -0.966  0.33414    
## wd_avg                    1.248e-03  1.348e-03   0.926  0.35423    
## ws_avg                   -7.136e-02  8.683e-02  -0.822  0.41120    
## daily_downwind_ref       -1.815e-02  3.815e-01  -0.048  0.96205    
## capacity                  6.881e-01  2.416e-02  28.477  < 2e-16 ***
## I(1/dist_wrp^2)          -2.730e-04  1.995e-05 -13.687  < 2e-16 ***
## monthly_oil_1km           3.638e-04  2.667e-04   1.364  0.17256    
## monthly_gas_1km           7.303e-04  9.757e-04   0.748  0.45420    
## active_1km                9.149e-02  9.378e-02   0.976  0.32928    
## daily_downwind_wrp        4.554e-01  4.533e-01   1.005  0.31508    
## elevation                 2.536e+00  1.487e-01  17.053  < 2e-16 ***
## EVI                       7.930e+01  3.021e+00  26.253  < 2e-16 ***
## num_odor_complaints       9.522e-01  2.756e-02  34.552  < 2e-16 ***
## I(1/dist_dc^2)            2.030e+00  1.117e-01  18.171  < 2e-16 ***
## avg_temp                  1.727e-02  3.066e-02   0.563  0.57323    
## avg_hum                  -2.637e-04  8.511e-03  -0.031  0.97529    
## precip                   -4.241e-02  5.876e-01  -0.072  0.94246    
## disaster                  3.211e+00  1.026e+00   3.130  0.00176 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     1.783  8.000  0.327
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   8.957  8.997 94.223
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.836 80.000  6.382
##                                                         p-value    
## s(as.numeric(month))                                      0.186    
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 122/124
## R-sq.(adj) =  0.331   Deviance explained = 34.2%
## GCV =  70.74  Scale est. = 69.57     n = 6989
plot(h2s_da_model_i)

i <- getViz(h2s_da_model_i)
plot(sm(i, 2)) + l_fitRaster() + l_fitContour() + l_points()

plotRGL(sm(i, 2), fix = c(`as.numeric(day)` = 0), residuals = F)

# try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_i), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Everything
h2s_da_model_j <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + capacity +
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
                           k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                        I(1/dist_dc^2) + avg_temp + avg_hum + precip, 
                      data = daily_full)
summary(h2s_da_model_j)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + 
##     precip
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.496e+02  1.385e+01 -25.237   <2e-16 ***
## year2021                  7.052e-01  1.335e+00   0.528   0.5974    
## year2022                  1.405e+00  1.689e+00   0.832   0.4055    
## year2023                  6.499e-01  2.180e+00   0.298   0.7656    
## as.character(weekday)Mon -3.333e-01  3.772e-01  -0.884   0.3769    
## as.character(weekday)Sat -2.687e-01  3.740e-01  -0.719   0.4724    
## as.character(weekday)Sun -2.956e-01  3.742e-01  -0.790   0.4297    
## as.character(weekday)Thu -3.134e-01  3.753e-01  -0.835   0.4036    
## as.character(weekday)Tue -6.738e-01  3.751e-01  -1.796   0.0725 .  
## as.character(weekday)Wed -3.640e-01  3.765e-01  -0.967   0.3337    
## wd_avg                    1.248e-03  1.350e-03   0.925   0.3551    
## ws_avg                   -7.657e-02  8.704e-02  -0.880   0.3790    
## daily_downwind_ref       -7.636e-02  3.820e-01  -0.200   0.8416    
## capacity                  6.803e-01  2.395e-02  28.403   <2e-16 ***
## I(1/dist_wrp^2)          -2.625e-04  1.033e-05 -25.420   <2e-16 ***
## monthly_oil_1km           4.951e-04  2.608e-04   1.899   0.0577 .  
## monthly_gas_1km           3.955e-04  9.490e-04   0.417   0.6769    
## active_1km                1.023e-01  9.493e-02   1.077   0.2814    
## daily_downwind_wrp        3.830e-01  4.538e-01   0.844   0.3986    
## elevation                 2.481e+00  1.435e-01  17.287   <2e-16 ***
## EVI                       7.803e+01  2.932e+00  26.614   <2e-16 ***
## num_odor_complaints       9.839e-01  2.678e-02  36.738   <2e-16 ***
## I(1/dist_dc^2)            1.967e+00  7.384e-02  26.644   <2e-16 ***
## avg_temp                  2.239e-02  3.056e-02   0.733   0.4638    
## avg_hum                  -1.267e-03  8.550e-03  -0.148   0.8822    
## precip                   -1.722e-01  5.868e-01  -0.293   0.7692    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     2.676      8  0.630
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 91.536
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 68.173     80  5.593
##                                                         p-value    
## s(as.numeric(month))                                     0.0709 .  
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 121/123
## R-sq.(adj) =  0.328   Deviance explained = 33.8%
## GCV = 70.933  Scale est. = 69.879    n = 6989
plot(h2s_da_model_j)

knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Everything w Disaster Indicator',
                              'Everything w.o Disaster Indicator'),
                    'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))))
Model R-Sq
Since Feb 2022 0.67
Disaster Only 0.50
Exclude Disaster 0.46
Everything w Disaster Indicator 0.33
Everything w.o Disaster Indicator 0.33

80/20 Cross Validation on Daily Average models

result <- tibble(Model = character(),
                 '80/20 Train R-Sq' = numeric(),
                 '80/20 Test R-Sq' = numeric())

validation_result_8020 <- tibble(Model = character(),
                            'Coef' = numeric(),
                            'R-Sq' = numeric())

predictors <- c('H2S_daily_avg', 'month', 'year', 'weekday', 'wd_avg', 'ws_avg', 'daily_downwind_ref', 'dist_wrp', 
                'mon_utm_x', 'mon_utm_y', 'day', 'monthly_oil_1km', 'monthly_gas_1km', 
                'active_1km', 'daily_downwind_wrp', 'elevation', 'EVI', 'num_odor_complaints',
                'dist_dc', 'capacity', 'avg_temp', 'avg_hum', 'precip') 

set.seed(123)

# model 1: since Feb 2022
train <- daily_full %>% 
  filter(day >= '2022-02-01') %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  slice_sample(prop = 0.8)

test <- anti_join(daily_full %>% 
                  filter(day >= '2022-02-01') %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f2 <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f2, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2)$n - 1)/
  (summary(h2s_da_model_f2)$n - summary(h2s_da_model_f2)$np - 1)

result <- rbind(result, tibble(Model = 'Since Feb 2022',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f2)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

f2_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 GAM') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Since Feb 2022',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))

# Model 2: Disaster (Oct-Dec 2021) only
train <- daily_full %>% 
  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  slice_sample(prop = 0.8)

test <- anti_join(daily_full %>% 
                  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f3 <- gam(H2S_daily_avg ~ month + as.character(weekday) +
                       wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                       s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                       te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                          k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                       monthly_oil_1km + monthly_gas_1km + active_1km + 
                       daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                       capacity + dist_dc + avg_temp + avg_hum + precip, 
                      data = train)

predictions <- predict(h2s_da_model_f3, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3)$n - 1)/
  (summary(h2s_da_model_f3)$n - summary(h2s_da_model_f3)$np - 1)

result <- rbind(result, tibble(Model = 'Disaster Only',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f3)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

f3_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only GAM') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Disaster Only',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))

# Model 3: Excluding Disaster
train <- daily_full %>% 
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  slice_sample(prop = 0.8)

test <- anti_join(daily_full %>% 
                  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f4 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f4, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4)$n - 1)/
  (summary(h2s_da_model_f4)$n - summary(h2s_da_model_f4)$np - 1)

result <- rbind(result, tibble(Model = 'Exclude Disaster',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f4)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 4: Everything with Disaster Indicator
train <- daily_full %>% 
  select(all_of(predictors)) %>% 
  mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>% 
  filter(complete.cases(.)) 

train <- rbind(
  train %>% filter(disaster==1) %>% slice_sample(prop = 0.8),
  train %>% filter(disaster==0) %>% slice_sample(prop = 0.8)
)

test <- anti_join(daily_full %>% 
                  select(all_of(predictors)) %>% 
                    mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip, disaster)`
h2s_da_model_f5 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         disaster +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f5, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5)$n - 1)/
  (summary(h2s_da_model_f5)$n - summary(h2s_da_model_f5)$np - 1)

result <- rbind(result, tibble(Model = 'Everything w. Disaster Indicator',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f5)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

# Model 5: Everything
train <- daily_full %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.)) %>% 
  slice_sample(prop = 0.8)

train <- rbind(
  train %>% filter(year == '2021' & month %in% c('10', '11', '12')) %>% slice_sample(prop = 0.8),
  train %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% slice_sample(prop = 0.8)
  )

test <- anti_join(daily_full %>% 
                  select(all_of(predictors)) %>% 
                  filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f6 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)

predictions <- predict(h2s_da_model_f6, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6)$n - 1)/
  (summary(h2s_da_model_f5)$n - summary(h2s_da_model_f6)$np - 1)

result <- rbind(result, tibble(Model = 'Everything w.o Disaster Indicator',
                               '80/20 Train R-Sq' = summary(h2s_da_model_f6)$r.sq,
                               '80/20 Test R-Sq' = adj_r2_test))

f6_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator Gam') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
f6_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        xlim(NA, 15) + ylim(NA, 15) +
                        theme_bw()

validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Everything',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))
knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Everything w Disaster Indicator',
                              'Everything w.o Disaster Indicator'),
                    'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))) %>% 
               left_join(result, join_by(Model)))
Model R-Sq 80/20 Train R-Sq 80/20 Test R-Sq
Since Feb 2022 0.67 0.6538821 0.6823609
Disaster Only 0.50 0.5175137 0.2921882
Exclude Disaster 0.46 0.5189897 0.2950658
Everything w Disaster Indicator 0.33 NA NA
Everything w.o Disaster Indicator 0.33 0.3372143 0.4443146

Cross Validation on each monitor

result_cv <- tibble(Model = character(),
                 'CV Avg Train R-Sq' = numeric(),
                 'CV Avg Test R-Sq' = numeric())

# model 1: since Feb 2022
post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01') %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(post2022feb$Monitor)

cv1_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- post2022feb %>%
    filter(Monitor != monitors[i])
  
  test <- post2022feb %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f2b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f2b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
    (summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
  
  cv1_result <- rbind(cv1_result, tibble(monitors[i],
                                         summary(h2s_da_model_f2b)$r.sq, 
                                         summary(h2s_da_model_f2b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Since Feb 2022',
                                           'CV Avg Train R-Sq' = mean(cv1_result$`summary(h2s_da_model_f2b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv1_result$adj_r2_test))))

# model 2: Disaster only
disaster <- daily_full %>% 
  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(disaster$Monitor)

cv2_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- disaster %>%
    filter(Monitor != monitors[i])
  
  test <- disaster %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f3b <- gam(H2S_daily_avg~ month + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f3b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3b)$n - 1)/
    (summary(h2s_da_model_f3b)$n - summary(h2s_da_model_f3b)$np - 1)
  
  cv2_result <- rbind(cv2_result, tibble(monitors[i],
                                         summary(h2s_da_model_f3b)$r.sq, 
                                         summary(h2s_da_model_f3b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Disaster Only',
                                           'CV Avg Train R-Sq' = mean(cv2_result$`summary(h2s_da_model_f3b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv2_result$adj_r2_test))))

# model 3: Exclude Disaster
exclude_disaster <- daily_full %>% 
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(exclude_disaster$Monitor)

cv3_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- exclude_disaster %>%
    filter(Monitor != monitors[i])
  
  test <- exclude_disaster %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f4b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f4b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4b)$n - 1)/
    (summary(h2s_da_model_f4b)$n - summary(h2s_da_model_f4b)$np - 1)
  
  cv3_result <- rbind(cv3_result, tibble(monitors[i],
                                         summary(h2s_da_model_f4b)$r.sq, 
                                         summary(h2s_da_model_f4b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Exclude Disaster',
                                           'CV Avg Train R-Sq' = mean(cv3_result$`summary(h2s_da_model_f4b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv3_result$adj_r2_test))))

# model 4: Everything w Disaster Indicator 
full_complete <- daily_full %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.)) %>%
  mutate(disaster = if_else(year == '2021' &  month %in% c('10', '11', '12'), 1, 0))

monitors <- unique(full_complete$Monitor)

cv4_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- full_complete %>%
    filter(Monitor != monitors[i])
  
  test <- full_complete %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f5b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         disaster + capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f5b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5b)$n - 1)/
    (summary(h2s_da_model_f5b)$n - summary(h2s_da_model_f5b)$np - 1)
  
  cv4_result <- rbind(cv4_result, tibble(monitors[i],
                                         summary(h2s_da_model_f5b)$r.sq, 
                                         summary(h2s_da_model_f5b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything w. Disaster Indicator',
                                           'CV Avg Train R-Sq' = mean(cv4_result$`summary(h2s_da_model_f5b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv4_result$adj_r2_test))))

# model 5: Everything w.o Disaster Indicator 
full_complete <- daily_full %>% 
  select(all_of(predictors), Monitor) %>% 
  filter(complete.cases(.))

monitors <- unique(full_complete$Monitor)

cv5_result <- tibble(Monitor = character(),
                     'Train Adj-R2' = numeric(),
                     'Train N' = numeric(),
                     'Test Adj-R2' = numeric(),
                     'Test B' = numeric()
                     )

for (i in 1:length(monitors)) {
  train <- full_complete %>%
    filter(Monitor != monitors[i])
  
  test <- full_complete %>%
    filter(Monitor == monitors[i])
  
  h2s_da_model_f6b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_f6b, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6b)$n - 1)/
    (summary(h2s_da_model_f6b)$n - summary(h2s_da_model_f6b)$np - 1)
  
  cv5_result <- rbind(cv5_result, tibble(monitors[i],
                                         summary(h2s_da_model_f6b)$r.sq, 
                                         summary(h2s_da_model_f6b)$n, 
                                         adj_r2_test, 
                                         nrow(test)))
}

result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything w.o Disaster Indicator',
                                           'CV Avg Train R-Sq' = mean(cv5_result$`summary(h2s_da_model_f6b)$r.sq`),
                                           'CV Avg Test R-Sq' = mean(cv5_result$adj_r2_test))))

cv1_result
cv2_result
cv3_result
cv4_result
cv5_result

10-fold CV

set.seed(1)
random_seeds <- floor(runif(5, min=0, max=1)*100)
result_10cv <- tibble(Model = character(),
                 '10CV AVG Train R-Sq' = numeric(),
                 '10CV AVG Test R-Sq' = numeric())

validation_result_10cv <- tibble(Model = character(),
                            'Coef' = numeric(),
                            'R-Sq' = numeric())

# model 1: since Feb 2022
obs_10cv_sincefeb2022 <- c()
pred_10cv_sincefeb2022 <- c()
adj_r2_sincefeb2022 <- c()
adj_r2_test_sincefeb2022 <- c()

sincefeb2022 <- daily_full %>% 
filter(day >= '2022-02-01') %>% 
select(all_of(predictors)) %>% 
filter(complete.cases(.))

set.seed(random_seeds[1])
folds <- createFolds(seq(1, nrow(sincefeb2022)), k = 10)
  
for (fold in seq(1, 10)) {
  test <- sincefeb2022[folds[[fold]], ]
  train <- anti_join(sincefeb2022, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                           wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                           s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                           te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                              k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                           monthly_oil_1km + monthly_gas_1km + active_1km + 
                           daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                           capacity + dist_dc + avg_temp + avg_hum + precip, 
                          data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_sincefeb2022 <- append(obs_10cv_sincefeb2022, test %>% pull(H2S_daily_avg))
  pred_10cv_sincefeb2022 <- append(pred_10cv_sincefeb2022, predictions)
  adj_r2_sincefeb2022 <- append(adj_r2_sincefeb2022, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_sincefeb2022 <- append(adj_r2_sincefeb2022, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Since Feb 2022',
                               '10CV AVG Train R-Sq' = mean(adj_r2_sincefeb2022),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_sincefeb2022)))

f2_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_sincefeb2022, pred = pred_10cv_sincefeb2022),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 GAM 10CV') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_10cv <- rbind(validation_result_10cv, 
                           tibble(Model = 'Since Feb 2022',
                                  'Coef' = summary(lm(obs_10cv_sincefeb2022 ~
                                                        pred_10cv_sincefeb2022))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_10cv_sincefeb2022 ~
                                                        pred_10cv_sincefeb2022))$r.squared))

# Model 2: Disaster (Oct-Dec 2021) only
obs_10cv_disaster <- c()
pred_10cv_disaster <- c()
adj_r2_disaster <- c()
adj_r2_test_disaster <- c()

disaster <- daily_full %>% 
  filter(year == '2021', month %in% c('10', '11', '12')) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.))

set.seed(random_seeds[2])
folds <- createFolds(seq(1, nrow(disaster)), k = 10)

for (fold in seq(1, 10)) {
  test <- disaster[folds[[fold]], ]
  train <- anti_join(disaster, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg ~ month + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_disaster <- append(obs_10cv_disaster, test %>% pull(H2S_daily_avg))
  pred_10cv_disaster <- append(pred_10cv_disaster, predictions)
  adj_r2_disaster <- append(adj_r2_disaster, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_disaster <- append(adj_r2_disaster, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Disaster Only',
                               '10CV AVG Train R-Sq' = mean(adj_r2_disaster),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_disaster)))

f3_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_disaster, pred = pred_10cv_disaster),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only GAM 10CV') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_10cv <- rbind(validation_result_10cv, 
                           tibble(Model = 'Disaster Only',
                                  'Coef' = summary(lm(obs_10cv_disaster ~
                                                        pred_10cv_disaster))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_10cv_disaster ~
                                                        pred_10cv_disaster))$r.squared))

# Model 3: Excluding Disaster
obs_10cv_excl_disaster <- c()
pred_10cv_excl_disaster <- c()
adj_r2_excl_disaster <- c()
adj_r2_test_excl_disaster <- c()

excl_disaster <- daily_full %>% 
  filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.))

set.seed(random_seeds[3])
folds <- createFolds(seq(1, nrow(excl_disaster)), k = 10)

for (fold in seq(1, 10)) {
  test <- excl_disaster[folds[[fold]], ]
  train <- anti_join(excl_disaster, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_excl_disaster <- append(obs_10cv_excl_disaster, test %>% pull(H2S_daily_avg))
  pred_10cv_excl_disaster <- append(pred_10cv_excl_disaster, predictions)
  adj_r2_excl_disaster <- append(adj_r2_excl_disaster, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_excl_disaster <- append(adj_r2_excl_disaster, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Exclude Disaster',
                               '10CV AVG Train R-Sq' = mean(adj_r2_excl_disaster),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_excl_disaster)))

# Model 4: Everything with Disaster Indicator
obs_10cv_disaster_ind <- c()
pred_10cv_disaster_ind <- c()
adj_r2_disaster_ind <- c()
adj_r2_test_disaster_ind <- c()

disaster_ind <- daily_full %>% 
  select(all_of(predictors)) %>% 
  mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>% 
  filter(complete.cases(.)) 

set.seed(random_seeds[4])
folds_1 <- createFolds(seq(1, nrow(disaster_ind %>% filter(disaster==1))), k = 10)
folds_0 <- createFolds(seq(1, nrow(disaster_ind %>% filter(disaster==0))), k = 10)

for (fold in seq(1, 10)) {
  test <- rbind(disaster_ind[folds_1[[fold]], ], disaster_ind[folds_0[[fold]], ])
  
  train <- anti_join(disaster_ind, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_disaster_ind <- append(obs_10cv_disaster_ind, test %>% pull(H2S_daily_avg))
  pred_10cv_disaster_ind <- append(pred_10cv_disaster_ind, predictions)
  adj_r2_disaster_ind <- append(adj_r2_disaster_ind, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_disaster_ind <- append(adj_r2_disaster_ind, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Everything w. Disaster Indicator',
                               '10CV AVG Train R-Sq' = mean(adj_r2_disaster_ind),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_disaster_ind)))


# Model 5: Everything
obs_10cv_everything <- c()
pred_10cv_everything <- c()
adj_r2_everything <- c()
adj_r2_test_everything <- c()

everything <- daily_full %>% 
  select(all_of(predictors)) %>% 
  filter(complete.cases(.))

set.seed(random_seeds[5])
folds <- createFolds(seq(1, nrow(everything), k = 10))
## Warning: In seq.default(1, nrow(everything), k = 10) :
##  extra argument 'k' will be disregarded
for (fold in seq(1, 10)) {
  test <- everything[folds[[fold]], ]
  train <- anti_join(everything, test, by = join_by(dist_dc, day))
  
  h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
                            k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                         daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                         capacity + dist_dc + avg_temp + avg_hum + precip, 
                        data = train)
  
  predictions <- predict(h2s_da_model_10cv, newdata = test)
  r2_test <- R2(test$H2S_daily_avg, predictions)
  adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
    (summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
  
  obs_10cv_everything <- append(obs_10cv_everything, test %>% pull(H2S_daily_avg))
  pred_10cv_everything <- append(pred_10cv_everything, predictions)
  adj_r2_everything <- append(adj_r2_everything, summary(h2s_da_model_10cv)$r.sq)
  adj_r2_test_everything <- append(adj_r2_everything, adj_r2_test)
}

result_10cv <- rbind(result_10cv, tibble(Model = 'Everything w.o Disaster Indicator',
                               '10CV AVG Train R-Sq' = mean(adj_r2_everything),
                               '10CV AVG Test R-Sq' = mean(adj_r2_test_everything)))

f6_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_everything, pred = pred_10cv_everything),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator GAM CV') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
f6_10cv_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_10cv_everything, pred = pred_10cv_everything),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        xlim(NA, 15) + ylim(NA, 50) +
                        theme_bw()

validation_result_10cv <- rbind(validation_result_10cv, 
                           tibble(Model = 'Everything',
                                  'Coef' = summary(lm(obs_10cv_everything ~
                                                        pred_10cv_everything))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_10cv_everything ~
                                                        pred_10cv_everything))$r.squared))

GAM result

knitr::kable(tibble(Model = c('Since Feb 2022',
                              'Disaster Only',
                              'Exclude Disaster',
                              'Everything w. Disaster Indicator',
                              'Everything w.o Disaster Indicator'),
                    'Train R-sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
                                    round(summary(h2s_da_model_g)$r.sq, 2),
                                    round(summary(h2s_da_model_h)$r.sq, 2),
                                    round(summary(h2s_da_model_i)$r.sq, 2),
                                    round(summary(h2s_da_model_j)$r.sq, 2))) %>% 
               left_join(result, join_by(Model)) %>%
               left_join(result_cv, join_by(Model)) %>%
               left_join(result_10cv, join_by(Model)),
             digits = 3)
Model Train R-sq 80/20 Train R-Sq 80/20 Test R-Sq CV Avg Train R-Sq CV Avg Test R-Sq 10CV AVG Train R-Sq 10CV AVG Test R-Sq
Since Feb 2022 0.67 0.654 0.682 0.667 0.236 0.664 0.665
Disaster Only 0.50 0.518 0.292 0.671 0.094 0.497 0.439
Exclude Disaster 0.46 0.519 0.295 0.477 0.085 0.463 0.446
Everything w. Disaster Indicator 0.33 0.386 0.612 0.498 0.115 0.321 0.327
Everything w.o Disaster Indicator 0.33 0.337 0.444 0.496 0.107 0.329 0.341

Observed vs. Predicted plots

ggarrange(f2_obs_vs_pred_plot, 
          f3_obs_vs_pred_plot,
          ggarrange(f6_obs_vs_pred_plot, f6_obs_vs_pred_plot_zoom, ncol = 2, labels = c("3", "4")), 
          labels = c("1", "2"),
          nrow = 3)
## Warning: Removed 21 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 21 rows containing missing values (`geom_point()`).

knitr::kable(validation_result_8020, digits = 3)
Model Coef R-Sq
Since Feb 2022 0.966 0.700
Disaster Only 0.761 0.414
Everything 1.201 0.321
ggarrange(f2_10cv_obs_vs_pred_plot, 
          f3_10cv_obs_vs_pred_plot,
          ggarrange(f6_10cv_obs_vs_pred_plot, f6_10cv_obs_vs_pred_plot_zoom, ncol = 2, labels = c("3", "4")), 
          labels = c("1", "2"),
          nrow = 3)
## Warning: Removed 86 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 86 rows containing missing values (`geom_point()`).

knitr::kable(validation_result_10cv, digits = 3)
Model Coef R-Sq
Since Feb 2022 0.980 0.649
Disaster Only 0.951 0.456
Everything 0.865 0.271

Monthly

h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.469e+00  3.010e-01   4.879 1.07e-06 ***
## year2021                          4.569e+00  1.704e-01  26.817  < 2e-16 ***
## year2022                         -4.981e+00  1.895e-01 -26.285  < 2e-16 ***
## year2023                         -2.272e+00  2.389e-01  -9.510  < 2e-16 ***
## wd_secQ2                         -9.341e-02  1.781e-01  -0.524      0.6    
## wd_secQ3                          1.662e+00  1.811e-01   9.179  < 2e-16 ***
## wd_secQ4                         -8.799e-01  1.712e-01  -5.140 2.74e-07 ***
## ws                                7.904e-02  1.924e-02   4.109 3.98e-05 ***
## I(1/MinDist^2)                    9.127e+05  1.544e+05   5.911 3.41e-09 ***
## RefineryMarathon (Carson)         2.223e+01  2.465e-01  90.170  < 2e-16 ***
## RefineryMarathon (Wilmington)     3.672e+00  2.867e-01  12.808  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)  2.444e+00  2.532e-01   9.652  < 2e-16 ***
## RefineryTorrance Refinery        -1.154e+00  2.332e-01  -4.949 7.48e-07 ***
## RefineryValero Refinery           5.945e+00  2.804e-01  21.202  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                      edf Ref.df    F p-value    
## s(as.numeric(month))   8      8 5716  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0357   Deviance explained = 3.57%
## GCV = 5419.9  Scale est. = 5419.9    n = 2056378
plot(h2s_ma_model_a)

# h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=10), data = full_data)
# summary(h2s_ma_model_b)
# plot(h2s_ma_model_b)

XGBoost Since 2022 Feb

validation_result <- tibble(Model = character(),
                                 'Coef' = character(),
                                 'R-Sq' = numeric())

xgb_result <- tibble(Model = character(),
                 'R-Sq' = numeric(),
                 '80/20 Train R-Sq' = numeric(),
                 '80/20 Test R-Sq' = numeric())

daily_full <- daily_full %>%
  mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
         Monitor = str_replace_all(Monitor, ' ', '_'),
         weekday = as.character(weekday),
         daily_downwind_ref = as.integer(daily_downwind_ref),
         daily_downwind_wrp = as.integer(daily_downwind_wrp))

train <- daily_full[complete.cases(daily_full),] %>%
  filter(day >= '2022-02-01')

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
                         max_depth = c(3, 4, 5),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))

# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", 
                        number=10,
                        verboseIter=TRUE, 
                        search='grid')

fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
fit.xgb_da$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.2 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 42 
## niter: 500
## nfeatures : 42 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96     500         5 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top = 20)

Get model performance at each fold

obs_xgb_sincefeb2022 <- c()
pred_xgb_sincefeb2022 <- c()
r2_test_xgb_sincefeb2022 <- c()

for (fold in seq(1, 10)) {
  test_fold <- train[fit.xgb_da$control$indexOut[[fold]], ]
  test_pred <- predict(fit.xgb_da$finalModel, 
                       newdata = test_fold %>% select(-H2S_daily_avg) %>% as.matrix())
  test_r2 <- postResample(test_pred, test_fold %>% pull(H2S_daily_avg))[[2]]
  
  obs_xgb_sincefeb2022 <- append(obs_xgb_sincefeb2022, test_fold %>% pull(H2S_daily_avg))
  pred_xgb_sincefeb2022 <- append(pred_xgb_sincefeb2022, test_pred)
  r2_test_xgb_sincefeb2022 <- append(r2_test_xgb_sincefeb2022, test_r2)
}
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = obs_xgb_sincefeb2022, pred = pred_xgb_sincefeb2022),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Since Feb 2022',
                                  'Coef' = summary(lm(obs_xgb_sincefeb2022 ~
                                                        pred_xgb_sincefeb2022))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_xgb_sincefeb2022 ~
                                                        pred_xgb_sincefeb2022))$r.squared))

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small.  fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 40.97
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2.03
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 4.1209
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 20)

80/20 CV

set.seed(313)

validation_result_8020 <- tibble(Model = character(),
                                 'Coef' = character(),
                                 'R-Sq' = numeric())

train <- daily_full[complete.cases(daily_full),] %>%
  filter(day >= '2022-02-01') %>%
  slice_sample(prop = 0.8)

test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(day >= '2022-02-01'), 
                  train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)


# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
                         max_depth = c(3, 4, 5),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))

# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", 
                        number=10,
                        verboseIter=TRUE, 
                        search='grid')

fit.xgb_da_8020 <- readRDS('rfiles/fit.xgb_da_8020.rds')
# fit.xgb_da_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_8020, 'rfiles/fit.xgb_da_8020.rds')
getTrainPerf(fit.xgb_da_8020)
# Test statistics
predictions <- predict(fit.xgb_da_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions, test %>% pull(H2S_daily_avg))
test_stat
##      RMSE  Rsquared       MAE 
## 0.2835988 0.7685410 0.1748351
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Since Feb 2022',
                           'R-Sq' = getTrainPerf(fit.xgb_da)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
xgb_sincefeb2022_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Since 2022 XGBoost') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Since Feb 2022',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))
fit.xgb_da_8020$finalModel
## ##### xgb.Booster
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 42 
## niter: 500
## nfeatures : 42 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96     500         5 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_8020$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : Chernobyl! trL>n 9

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : Chernobyl! trL>n 9
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_8020$finalModel, top_n = 20)

CV by leaving each monitor out once

post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01')

monitor_names <- unique(post2022feb$Monitor)

# for (i in 1:length(monitor_names)) {
#   train <- post2022feb %>%

#     filter(Monitor != monitor_names[i])
# 
#   train <- train[complete.cases(train),]
# 
#   train <- fastDummies::dummy_cols(train %>%
#                      select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
#                              monitor_lat, monitor_lon, county, dist_213)) %>%
#                      mutate(MinDist = 1/(MinDist^2),
#                             dist_wrp = 1/(dist_wrp^2)),
#                      remove_selected_columns = TRUE)
# 
#   fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# 
#   saveRDS(fit.xgb_da, paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
# }
model_stats <- tibble(Monitor = character(), train_r2 = numeric(), test_r2 = numeric())

add_cols <- function(df, cols) {
  add <- cols[!cols %in% names(df)]
  if(length(add) !=0 ) df[add] <- NA
  return(df)
}

for (i in setdiff(1:length(monitor_names), c(9))) {

  fit.xgb_da <- readRDS(paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))

  test <- post2022feb %>%
    filter(Monitor == monitor_names[i])

  test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
                     ungroup() %>%
                     select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213)) %>%
                       add_cols(c('year_2022', 'year_2023', 'month_01', 'month_02',
                                  'month_03', 'month_04', 'month_05', 'month_06',
                                  'month_07', 'month_08', 'month_09', 'month_10',
                                  'month_11', 'month_12')) %>%
                     mutate(MinDist = 1/(MinDist^2),
                            dist_wrp = 1/(dist_wrp^2),
                            year_2022 = ifelse(is.na(year_2022), 0, year_2022),
                            year_2023 = ifelse(is.na(year_2023), 0, year_2023),
                            month_01 = ifelse(is.na(month_01), 0, month_01),
                            month_02 = ifelse(is.na(month_02), 0, month_02),
                            month_03 = ifelse(is.na(month_03), 0, month_03),
                            month_04 = ifelse(is.na(month_04), 0, month_04),
                            month_05 = ifelse(is.na(month_05), 0, month_05),
                            month_06 = ifelse(is.na(month_06), 0, month_06),
                            month_07 = ifelse(is.na(month_07), 0, month_07),
                            month_08 = ifelse(is.na(month_08), 0, month_08),
                            month_09 = ifelse(is.na(month_09), 0, month_09),
                            month_10 = ifelse(is.na(month_10), 0, month_10),
                            month_11 = ifelse(is.na(month_11), 0, month_11),
                            month_12 = ifelse(is.na(month_12), 0, month_12)),
                     remove_selected_columns = TRUE)

  train_r2 <- getTrainPerf(fit.xgb_da)$TrainRsquared

  predictions <- predict(fit.xgb_da$finalModel,
                         newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())

  test_r2 <- R2(test %>% pull(H2S_daily_avg), predictions)

  model_stats <- rbind(model_stats, tibble(Monitor = monitor_names[i], train_r2, test_r2))
}

model_stats
tibble(Monitor = 'Total Average', train_r2 = mean(model_stats$train_r2, na.rm = T),
                             test_r2 = mean(model_stats$test_r2, na.rm = T))

XGBoost Disaster

Daily average model

train <- daily_full[complete.cases(daily_full),] %>%
  filter(year == '2021', month %in% c('10', '11', '12'))

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

fit.xgb_da_dis <- readRDS('rfiles/fit.xgb_da_dis.rds')
# fit.xgb_da_dis <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis, 'rfiles/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31 
## niter: 500
## nfeatures : 31 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96     500         5 0.1  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
plot(imp, top = 20)

Get model performance at each fold

obs_xgb_disaster <- c()
pred_xgb_disaster <- c()
r2_test_xgb_disaster <- c()

for (fold in seq(1, 10)) {
  test_fold <- train[fit.xgb_da_dis$control$indexOut[[fold]], ]
  test_pred <- predict(fit.xgb_da_dis$finalModel, 
                       newdata = test_fold %>% select(-H2S_daily_avg) %>% as.matrix())
  test_r2 <- postResample(test_pred, test_fold %>% pull(H2S_daily_avg))[[2]]
  
  obs_xgb_disaster <- append(obs_xgb_disaster, test_fold %>% pull(H2S_daily_avg))
  pred_xgb_disaster <- append(pred_xgb_disaster, test_pred)
  r2_test_xgb_disaster <- append(r2_test_xgb_disaster, test_r2)
}
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = obs_xgb_disaster, pred = pred_xgb_disaster),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for Disaster Only XGBoost') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_xgb_disaster, pred = pred_xgb_disaster),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        xlim(0, 50) + ylim(0, 50) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result, 
                           tibble(Model = 'Disaster Only',
                                  'Coef' = summary(lm(obs_xgb_disaster ~
                                                        pred_xgb_disaster))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_xgb_disaster ~
                                                        pred_xgb_disaster))$r.squared))

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
              model = fit.xgb_da_dis$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
              model = fit.xgb_da_dis$finalModel, top_n = 20)

80/20 CV

set.seed(313)

train <- daily_full[complete.cases(daily_full),] %>%
  filter(year == '2021', month %in% c('10', '11', '12')) %>%
  slice_sample(prop = 0.8)

test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(year == '2021', month %in% c('10', '11', '12')), 
                  train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213, year)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)

fit.xgb_da_dis_8020 <- readRDS('rfiles/fit.xgb_da_dis_8020.rds')
# fit.xgb_da_dis_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis_8020, 'rfiles/fit.xgb_da_dis_8020.rds')
getTrainPerf(fit.xgb_da_dis_8020)
# Test statistics
predictions <- predict(fit.xgb_da_dis_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions, 
             test %>% pull(H2S_daily_avg))
test_stat
##       RMSE   Rsquared        MAE 
## 23.4073974  0.6423252  6.3878800
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Disaster Only',
                           'R-Sq' = getTrainPerf(fit.xgb_da_dis)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_dis_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
xgb_dis_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Disaster Only XGBoost') +
                        theme_bw()

xgb_dis_obs_vs_pred_plot_zoom_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        xlim(0, 10) + ylim(0, 10) +
                        theme_bw()
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Disaster Only',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))
fit.xgb_da_dis_8020$finalModel
## ##### xgb.Booster
## raw: 209.9 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 31 
## niter: 100
## nfeatures : 31 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 205     100         5 0.3  0.01              0.5                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_dis_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_dis_8020$finalModel, top_n = 20)

XGBoost Full

Daily average model

train <- daily_full[complete.cases(daily_full),]

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

fit.xgb_da_full <- readRDS('rfiles/fit.xgb_da_full.rds')
# fit.xgb_da_full <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full, 'rfiles/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.2 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 40 
## niter: 500
## nfeatures : 40 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##      nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 204     500         5 0.3  0.01              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
plot(imp, top = 20)

Get model performance at each fold

obs_xgb_everything <- c()
pred_xgb_everything <- c()
r2_test_xgb_everything <- c()

for (fold in seq(1, 10)) {
  test_fold <- train[fit.xgb_da_full$control$indexOut[[fold]], ]
  test_pred <- predict(fit.xgb_da_full$finalModel, 
                       newdata = test_fold %>% select(-H2S_daily_avg) %>% as.matrix())
  test_r2 <- postResample(test_pred, test_fold %>% pull(H2S_daily_avg))[[2]]
  
  obs_xgb_everything <- append(obs_xgb_everything, test_fold %>% pull(H2S_daily_avg))
  pred_xgb_everything <- append(pred_xgb_everything, test_pred)
  r2_test_xgb_everything <- append(r2_test_xgb_everything, test_r2)
}
xgb_everything_obs_vs_pred_plot <- ggplot(tibble(obs = obs_xgb_everything, pred = pred_xgb_everything),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Observed vs Predicted for everything XGBoost') +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_xgb_everything, pred = pred_xgb_everything),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        xlim(0, 30) + ylim(0, 30) +
                        theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,  
                           tibble(Model = 'Everything w.o Disaster Indicator',
                                  'Coef' = summary(lm(obs_xgb_everything ~
                                                        pred_xgb_everything))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(obs_xgb_everything ~
                                                        pred_xgb_everything))$r.squared))

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
              model = fit.xgb_da_full$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
              model = fit.xgb_da_full$finalModel, top_n = 20)

80/20 CV

set.seed(313)

train <- rbind(
  daily_full[complete.cases(daily_full),] %>%
    filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
    slice_sample(prop = 0.8),
  daily_full[complete.cases(daily_full),] %>%
    filter(year == '2021', month %in% c('10', '11', '12')) %>%
    slice_sample(prop = 0.8)
)

test <- anti_join(daily_full[complete.cases(daily_full),], train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                             monitor_lat, monitor_lon, county, dist_213, year)) %>%
                   mutate(MinDist = 1/(MinDist^2),
                          dist_wrp = 1/(dist_wrp^2)),
                   remove_selected_columns = TRUE)

test <- fastDummies::dummy_cols(test %>%
                 select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
                           monitor_lat, monitor_lon, county, dist_213, year)) %>%
                 mutate(MinDist = 1/(MinDist^2),
                        dist_wrp = 1/(dist_wrp^2)),
                 remove_selected_columns = TRUE)

fit.xgb_da_full_8020 <- readRDS('rfiles/fit.xgb_da_full_8020.rds')
# fit.xgb_da_full_8020 <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full_8020, 'rfiles/fit.xgb_da_full_8020.rds')
getTrainPerf(fit.xgb_da_full_8020)
# Test statistics
predictions <- predict(fit.xgb_da_full_8020$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions, 
             test %>% pull(H2S_daily_avg))
test_stat
##      RMSE  Rsquared       MAE 
## 2.4223982 0.2317491 0.4381407
xgb_result <- rbind(xgb_result, 
                    tibble(Model = 'Everything w.o Disaster Indicator',
                           'R-Sq' = getTrainPerf(fit.xgb_da_full)$TrainRsquared,
                           '80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_full_8020)$TrainRsquared,
                           '80/20 Test R-Sq' = test_stat[[2]]))
xgb_full_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Everything w.o Disaster Indicator XGBoost') +
                        theme_bw()

xgb_full_obs_vs_pred_plot_zoom_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
                             aes(x = pred, y = obs)) +
                        geom_point() +
                        geom_abline(intercept = 0, slope = 1, color = 'red') +
                        geom_smooth(method = 'lm', formula = y ~ x) +
                        labs(y = 'Observed', x = 'Predicted', 
                             title = 'Zoomed In') +
                        xlim(0, 10) + ylim(0, 10) +
                        theme_bw()
validation_result_8020 <- rbind(validation_result_8020, 
                           tibble(Model = 'Everything',
                                  'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$coefficients[2, 1], 
                                  'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
                                                        predictions))$r.squared))
fit.xgb_da_full_8020$finalModel
## ##### xgb.Booster
## raw: 1.1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 40 
## niter: 500
## nfeatures : 40 
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 81     500         5 0.1 0.001              0.5                0         1
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)

matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da_full_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da_full_8020$finalModel, top_n = 20)

XGBoost Result

xgb_result

Observed Vs. Predicted Plots 10-fold CV

ggarrange(xgb_sincefeb2022_obs_vs_pred_plot, 
          ggarrange(xgb_disaster_obs_vs_pred_plot, xgb_disaster_obs_vs_pred_plot_zoom,  
                    ncol = 2, labels = c("2", "3")),
          ggarrange(xgb_everything_obs_vs_pred_plot, xgb_everything_obs_vs_pred_plot_zoom, 
                    ncol = 2, labels = c("4", "5")), 
                    labels = c("1"),
                    nrow = 3)
## Warning: Removed 75 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 75 rows containing missing values (`geom_point()`).
## Warning: Removed 161 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 161 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_smooth()`).

knitr::kable(validation_result, digits = 3)
Model Coef R-Sq
Since Feb 2022 1.026 0.989
Disaster Only 0.675 0.666
Everything w.o Disaster Indicator 1.000 1.000

Observed Vs. Predicted Plots 10-fold CV + 80/20 split

ggarrange(xgb_sincefeb2022_obs_vs_pred_plot_8020, 
          ggarrange(xgb_dis_obs_vs_pred_plot_8020, xgb_dis_obs_vs_pred_plot_zoom_8020,  
                    ncol = 2, labels = c("2", "3")),
          ggarrange(xgb_full_obs_vs_pred_plot_8020, xgb_full_obs_vs_pred_plot_zoom_8020, 
                    ncol = 2, labels = c("4", "5")), 
                    labels = c("1"),
                    nrow = 3)
## Warning: Removed 20 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 20 rows containing missing values (`geom_point()`).
## Warning: Removed 29 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 29 rows containing missing values (`geom_point()`).

knitr::kable(validation_result_8020)
Model Coef R-Sq
Since Feb 2022 1.0204725 0.7685410
Disaster Only 0.7062446 0.6423252
Everything 0.9326755 0.2317491

Extreme Value Models

EVGAM

# for this part, I will fit from Jan-2021 to April-2022
ev_data <- daily_full %>%
  filter(day >= '2021-01-01',  day < '2022-05-01') %>%
  mutate(month = as.numeric(month),
         weekday = as.character(weekday),
         day = as.numeric(day),
         dist_wrp = I(1/dist_wrp^2),
         dist_dc = I(1/dist_dc^2),
         mon_utm_x = I(mon_utm_x/10^3), 
         mon_utm_y = I(mon_utm_y/10^3))
daily_max_gev <- list(H2S_daily_max ~ s(month,bs='cc') + year, 
                 ~ s(month,bs='cc') + year +
                   weekday + wd_avg + ws_avg + daily_downwind_ref + 
                   capacity + dist_wrp + 
                   s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +  
                   te(mon_utm_x, mon_utm_y, day, 
                      k=c(10,10),d=c(2,1),bs=c('tp','cc')) + 
                   monthly_oil_1km + monthly_gas_1km + active_1km + 
                   daily_downwind_wrp + elevation + EVI + num_odor_complaints +
                   dist_dc + avg_temp + avg_hum + precip, 
                 ~ 1)

dm_gev <- readRDS('rfiles/dm_gev.rds')
# dm_gev <- evgam(daily_max_gev, ev_data, family = "gev")
# saveRDS(dm_gev, 'rfiles/dm_gev.rds')
summary(dm_gev)
## 
## ** Parametric terms **
## 
## location
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     1.39       0.02   57.56   <2e-16
## year2022        0.14       0.06    2.39  0.00846
## 
## logscale
##                     Estimate Std. Error       t value Pr(>|t|)
## (Intercept)            -6.61       1.78 -3.720000e+00 9.93e-05
## year2022                0.49       0.14  3.560000e+00 0.000184
## weekdayMon              0.05       0.05  9.800000e-01    0.162
## weekdaySat             -0.01       0.05 -1.900000e-01    0.424
## weekdaySun              0.00       0.05 -4.000000e-02    0.484
## weekdayThu              0.11       0.05  2.170000e+00    0.015
## weekdayTue              0.08       0.05  1.630000e+00   0.0516
## weekdayWed              0.03       0.05  5.100000e-01    0.303
## wd_avg                  0.00       0.00  2.300000e+00   0.0108
## ws_avg                  0.01       0.01  6.000000e-01    0.273
## daily_downwind_ref      0.07       0.05  1.400000e+00   0.0813
## capacity                0.00       0.00 -2.990000e+00   0.0014
## dist_wrp            45424.75       0.00  2.423312e+10   <2e-16
## monthly_oil_1km         0.00       0.00  7.040000e+00 9.96e-13
## monthly_gas_1km         0.00       0.00 -1.058000e+01   <2e-16
## active_1km              0.03       0.03  1.050000e+00    0.148
## daily_downwind_wrp     -0.03       0.06 -5.200000e-01    0.303
## elevation              -0.03       0.01 -3.160000e+00 0.000798
## EVI                    -0.46       0.15 -3.010000e+00  0.00131
## num_odor_complaints     0.05       0.01  8.660000e+00   <2e-16
## dist_dc              3098.92       0.00  2.100542e+07   <2e-16
## avg_temp                0.01       0.00  1.300000e+00   0.0962
## avg_hum                 0.00       0.00 -1.930000e+00   0.0267
## precip                 -0.06       0.08 -7.400000e-01     0.23
## 
## shape
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     0.21       0.02   11.44   <2e-16
## 
## ** Smooth terms **
## 
## location
##          edf max.df Chi.sq Pr(>|t|)
## s(month) 6.2      8 375.69   <2e-16
## 
## logscale
##                               edf max.df Chi.sq Pr(>|t|)
## s(month)                     7.80      8 356.16   <2e-16
## s(mon_utm_x,mon_utm_y)       5.52      9  27.63 4.12e-05
## te(mon_utm_x,mon_utm_y,day) 54.02     80 290.64   <2e-16
plot(dm_gev)

MGCV

m1 <- readRDS('rfiles/m1.rds')
# m1 <- gam(list(H2S_daily_max ~ s(month,bs='cc') + year,
#                ~ s(month,bs='cc') + year +
#                    weekday + wd_avg + ws_avg + daily_downwind_ref +
#                    capacity + dist_wrp +
#                    s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
#                    te(mon_utm_x, mon_utm_y, day,
#                       k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
#                    monthly_oil_1km + monthly_gas_1km + active_1km +
#                    daily_downwind_wrp + elevation + EVI + num_odor_complaints +
#                    dist_dc + avg_temp + avg_hum + precip,
#                ~ 1),
#           data = ev_data, method = "REML",
#           family = gevlss(link = list("identity", "identity", "identity")))
# saveRDS(m1, 'rfiles/m1.rds')
summary(m1)
## 
## Family: gevlss 
## Link function: identity identity identity 
## 
## Formula:
## H2S_daily_max ~ s(month, bs = "cc") + year
## ~s(month, bs = "cc") + year + weekday + wd_avg + ws_avg + daily_downwind_ref + 
##     capacity + dist_wrp + s(mon_utm_x, mon_utm_y, bs = "tp", 
##     k = 10) + te(mon_utm_x, mon_utm_y, day, k = c(10, 10), d = c(2, 
##     1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints + 
##     dist_dc + avg_temp + avg_hum + precip
## ~1
## 
## Parametric coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            1.392e+00  2.397e-02  58.102  < 2e-16 ***
## year2022               1.361e-01  5.650e-02   2.408 0.016032 *  
## (Intercept).1         -6.973e+00  1.758e+00  -3.966 7.30e-05 ***
## year2022.1             4.965e-01  1.393e-01   3.565 0.000364 ***
## weekdayMon.1           5.013e-02  5.205e-02   0.963 0.335492    
## weekdaySat.1          -9.562e-03  4.864e-02  -0.197 0.844165    
## weekdaySun.1          -3.141e-03  4.897e-02  -0.064 0.948862    
## weekdayThu.1           1.112e-01  5.116e-02   2.173 0.029786 *  
## weekdayTue.1           8.063e-02  5.002e-02   1.612 0.106935    
## weekdayWed.1           2.570e-02  5.172e-02   0.497 0.619250    
## wd_avg.1               4.441e-04  1.908e-04   2.328 0.019930 *  
## ws_avg.1               5.815e-03  1.002e-02   0.580 0.561639    
## daily_downwind_ref.1   6.469e-02  4.793e-02   1.350 0.177101    
## capacity.1            -1.406e-03  5.051e-04  -2.785 0.005359 ** 
## dist_wrp.1            -3.640e+05  1.663e+06  -0.219 0.826789    
## monthly_oil_1km.1      8.399e-04  1.148e-04   7.313 2.62e-13 ***
## monthly_gas_1km.1     -2.707e-03  2.570e-04 -10.533  < 2e-16 ***
## active_1km.1           2.400e-02  2.542e-02   0.944 0.345128    
## daily_downwind_wrp.1  -3.218e-02  6.327e-02  -0.509 0.611053    
## elevation.1           -3.246e-02  1.071e-02  -3.031 0.002436 ** 
## EVI.1                 -4.050e-01  4.274e-01  -0.947 0.343434    
## num_odor_complaints.1  5.175e-02  5.954e-03   8.692  < 2e-16 ***
## dist_dc.1              3.003e+03  5.827e+02   5.154 2.55e-07 ***
## avg_temp.1             6.181e-03  4.631e-03   1.335 0.182010    
## avg_hum.1             -2.800e-03  1.439e-03  -1.946 0.051672 .  
## precip.1              -5.810e-02  7.946e-02  -0.731 0.464715    
## (Intercept).2          2.061e-01  1.778e-02  11.589  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                  edf Ref.df Chi.sq p-value    
## s(month)                       5.946  8.000 531.90 < 2e-16 ***
## s.1(month)                     7.743  8.000 372.15 < 2e-16 ***
## s.1(mon_utm_x,mon_utm_y)       4.972  5.554  22.42 0.00041 ***
## te.1(mon_utm_x,mon_utm_y,day) 53.079 80.000 380.71 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Deviance explained =   NA%
## -REML = 7658.9  Scale est. = 1         n = 3904
plot(m1, pages = 1, scheme = 1, scale = 0, seWithMean = TRUE)